library(ggplot2)
library(reshape2)
library(dplyr)
library(tidyverse)
Urban: Urban/Rural Status (1 if Urban counties, 0 if
Rural counties)PhysicalHealth: Number of days that participant’s
physical health is not good in the past 30 daysMentalHealth: Number of days that participant mental
health is not good in the past 30 daysPhysicalActivity: participant who reported doing
exercise in the past 30 days (1 if did, 0 if didn’t)Asthma: Whether participants experienced asthma (1 if
current, 2 if former, 3 if never)Arthritis: Whether participants diagnosed with some
form of arthritis (1 if diagnosed, 0 if not diagnosed)Sex: (1 if male, 0 if female)Race: (1 if White, 2 if Black, 3 if American Indian or
Alaskan Native, 4 if Asian, 5 if Native Hawaiian or other Pacific
Islander only, 6 if other race, 7 if Multiracial, 8 if Hispanic)Age: (1 if Age 18-24, 2 if Age 25-34, 3 if Age 35-44, 4
if Age 45-54, 5 if Age 55-64, 6 if Age 65+)BMI: Calculated BMI based on Height and Weight.Education: Level of education (1 if didn’t graduate
high school, 2 if graduated high school, 3 if attended college or
technical school, 4 if graduated from college or technical school)Smoke: Levels of smoking (1 if smokes everyday, 2 if
smokes some days, 3 if former smoker, 4 is never smoked)Alcohol: Number of alcoholic beverages consumed per
week.HHADULT: Number of members in householdHIV: Participants tested for HIV (1 if Yes, 0 if
No)Fruit: Total fruits consumed per dayVegetable: Total vegetables consumed per dayCholesterol: Cholesterol is high (1 if Yes, 0 if
No)Stroke: Had a stroke (1 if Yes, 0 if No)MICHD: Participants have reported having myocardial
infarction(MI) or coronary heart disease(CHD) (1 if having, 0 if not
having)Dependent variable: MICHD
Independent variables: all other
michd = read.csv("MICHD.csv")
str(michd)
## 'data.frame': 184877 obs. of 20 variables:
## $ Urban : int 0 1 1 1 1 1 1 1 1 1 ...
## $ PhysicalHealth : num 0 0 0 0 0 ...
## $ MentalHealth : num 0 1.79 0 0 1.61 ...
## $ PhysicalActivity: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Asthma : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Arthritis : int 0 0 0 0 1 0 0 1 0 1 ...
## $ Sex : int 0 0 1 0 0 1 1 1 1 0 ...
## $ Race : int 1 1 1 1 1 4 1 1 1 1 ...
## $ Age : int 2 3 5 5 6 3 6 5 5 4 ...
## $ BMI : num 25.6 29.5 31 35.4 25.7 ...
## $ Education : int 4 4 3 3 2 4 4 4 4 1 ...
## $ Smoke : int 4 4 4 4 4 1 3 4 4 4 ...
## $ Alcohol : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HHADULT : num 0.693 0.693 0.693 0.693 0.693 ...
## $ HIV : int 1 0 0 0 0 0 0 0 0 1 ...
## $ Fruit : num 0.00693 0.00358 0.00405 0.00451 0.01418 ...
## $ Vegetable : num 1.061 0.963 1.051 0.871 1.188 ...
## $ Cholesterol : int 0 0 1 0 1 0 1 1 1 0 ...
## $ Stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MICHD : int 0 0 0 0 0 0 1 0 0 0 ...
col_factor = c("Urban", "PhysicalActivity","Asthma","Arthritis","Sex","Race","Age",
"Education","Alcohol", "Smoke","HIV","Cholesterol", "Stroke", "MICHD")
michd[col_factor] = lapply(michd[col_factor], factor)
str(michd)
## 'data.frame': 184877 obs. of 20 variables:
## $ Urban : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...
## $ PhysicalHealth : num 0 0 0 0 0 ...
## $ MentalHealth : num 0 1.79 0 0 1.61 ...
## $ PhysicalActivity: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Asthma : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
## $ Arthritis : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 2 ...
## $ Sex : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 2 2 2 1 ...
## $ Race : Factor w/ 8 levels "1","2","3","4",..: 1 1 1 1 1 4 1 1 1 1 ...
## $ Age : Factor w/ 6 levels "1","2","3","4",..: 2 3 5 5 6 3 6 5 5 4 ...
## $ BMI : num 25.6 29.5 31 35.4 25.7 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 4 4 3 3 2 4 4 4 4 1 ...
## $ Smoke : Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 1 3 4 4 4 ...
## $ Alcohol : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ HHADULT : num 0.693 0.693 0.693 0.693 0.693 ...
## $ HIV : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
## $ Fruit : num 0.00693 0.00358 0.00405 0.00451 0.01418 ...
## $ Vegetable : num 1.061 0.963 1.051 0.871 1.188 ...
## $ Cholesterol : Factor w/ 2 levels "0","1": 1 1 2 1 2 1 2 2 2 1 ...
## $ Stroke : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ MICHD : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
# Density Plot
michd.num <- michd %>% select(c("PhysicalHealth", "MentalHealth", "BMI", "HHADULT", "Fruit", "Vegetable", "MICHD"))
ggplot(melt(michd.num), aes(x = value, fill = MICHD)) +
geom_density(alpha = 0.5) +
facet_wrap( ~ variable, scale = "free")
# Conditional density plots
a <- michd %>%
ggplot() +
geom_histogram(aes(x=PhysicalHealth,
fill=(MICHD==1)),
position='fill') +
theme_bw() +
xlab('Number of Days Physical Health Not Good in past 30 Days') +
ylab('Frequency') +
theme(axis.title=element_text(size=14),
axis.text=element_text(size=14),
legend.text=element_text(size=14)) +
scale_fill_manual(name='',
labels=c('No MICHD', 'Had MICHD'),
values=c('grey', 'red'))
b <- michd %>%
ggplot() +
geom_histogram(aes(x=MentalHealth,
fill=(MICHD==1)),
position='fill') +
theme_bw() +
xlab('Number of Days Mental Health Not Good in past 30 Days') +
ylab('Frequency') +
theme(axis.title=element_text(size=14),
axis.text=element_text(size=14),
legend.text=element_text(size=14)) +
scale_fill_manual(name='',
labels=c('No MICHD', 'Had MICHD'),
values=c('grey', 'red'))
c <- michd %>%
ggplot() +
geom_histogram(aes(x=BMI,
fill=(MICHD==1)),
position='fill') +
theme_bw() +
xlab('BMI') +
ylab('Frequency') +
theme(axis.title=element_text(size=14),
axis.text=element_text(size=14),
legend.text=element_text(size=14)) +
scale_fill_manual(name='',
labels=c('No MICHD', 'Had MICHD'),
values=c('grey', 'red'))
d <- michd %>%
ggplot() +
geom_histogram(aes(x=HHADULT,
fill=(MICHD==1)),
position='fill') +
theme_bw() +
xlab('HHADULT') +
ylab('Frequency') +
theme(axis.title=element_text(size=14),
axis.text=element_text(size=14),
legend.text=element_text(size=14)) +
scale_fill_manual(name='',
labels=c('No MICHD', 'Had MICHD'),
values=c('grey', 'red'))
e <- michd %>%
ggplot() +
geom_histogram(aes(x=Fruit,
fill=(MICHD==1)),
position='fill') +
theme_bw() +
xlab('Total fruits consumed per day') +
ylab('Frequency') +
theme(axis.title=element_text(size=14),
axis.text=element_text(size=14),
legend.text=element_text(size=14)) +
scale_fill_manual(name='',
labels=c('No MICHD', 'Had MICHD'),
values=c('grey', 'red'))
f <- michd %>%
ggplot() +
geom_histogram(aes(x=Vegetable,
fill=(MICHD==1)),
position='fill') +
theme_bw() +
xlab('Total Vegatables consumed per day') +
ylab('Frequency') +
theme(axis.title=element_text(size=14),
axis.text=element_text(size=14),
legend.text=element_text(size=14)) +
scale_fill_manual(name='',
labels=c('No MICHD', 'Had MICHD'),
values=c('grey', 'red'))
library(gridExtra)
grid.arrange(a,b,c,d,e,f, ncol = 2, nrow = 3)
# Select numeric variables
michd.cor = michd %>% select(c("PhysicalHealth", "MentalHealth",
"BMI", "HHADULT", "Fruit", "Vegetable"))
michd.cor$Age.rank <- rank(michd$Age)
michd.cor$Education.rank <- rank(michd$Education)
michd.cor$Smoke.rank <- rank(michd$Smoke)
c = cor(michd.cor, method = "spearman")
library(corrplot)
corrplot(c, addCoef.col = "red", tl.cex = 0.8)
set.seed(123)
michd_matrix_c <- model.matrix(~ . - 1, data = michd[, colnames(michd)])
michd_matrix_c[,'BMI'] <- scale(michd_matrix_c[,'BMI'])
michd_matrix_c[,'HHADULT'] <- scale(michd_matrix_c[,'HHADULT'])
michd_matrix_c[,'Fruit'] <- scale(michd_matrix_c[,'Fruit'])
michd_matrix_c[,'Vegetable'] <- scale(michd_matrix_c[,'Vegetable'])
set.seed(123)
twcv = function(k) kmeans(michd_matrix_c, k, nstart = 25)$tot.withinss
k <- 1:15
twcv_values <- sapply(k, twcv)
plot(twcv_values, type = 'b', pch = 19)
library(factoextra)
k6 <- kmeans(michd_matrix_c, centers = 6, nstart = 25)
fviz_cluster(k6, geom = "point", data = michd_matrix_c)
k8 <- kmeans(michd_matrix_c, centers = 8, nstart = 25)
fviz_cluster(k8, geom = "point", data = michd_matrix_c)
k2 <- kmeans(michd_matrix_c, centers = 2, nstart = 25)
fviz_cluster(k2, geom = "point", data = michd_matrix_c)
pca = prcomp(michd_matrix_c)
fviz_pca_biplot(pca,repel = TRUE, col.var = "red", col.ind = "white", label = c("var"))